# -*- coding: utf-8 -*-
#!/usr/bin/env python

# Raw data conversion.
# From: one folder per label; each file in that folder holds one item
#       (a sentence or a document).
# To:   one line per item, formatted as
#       <Label><tab><sentence>

import os


def Convert2SVMData(_dir,label,_output_file, validate_rate=0.2):
    """Flatten every file of one label folder into single-line samples.

    Each regular file directly inside ``_dir`` is read in full, and all
    newlines, carriage returns, spaces and tabs are removed so that the
    content fits on one line.  Samples are then appended, one per line, as
    ``<label><tab><content>`` to two files:

    * ``_output_file + ".test"``  -- the first ``int(n * validate_rate)`` samples
    * ``_output_file + ".train"`` -- the remaining samples

    Both files are opened in append mode, so several labels can be
    accumulated into the same corpus by calling this function repeatedly
    (and re-running it will add the same samples again).  Both files are
    created even when they receive no lines.

    Args:
        _dir: Folder that contains one file per sample.
        label: Label written in front of every sample (converted with ``str``).
        _output_file: Path prefix of the ``.test`` / ``.train`` output files.
        validate_rate: Fraction of the samples (0..1) written to ``.test``.

    Raises:
        ValueError: If ``validate_rate`` is outside the range [0, 1].
    """
    # Reject a bad rate before touching any file; otherwise the split index
    # would run past the sample list (or wrap around via negative indexing)
    # after output had already been partially appended.
    if not 0 <= validate_rate <= 1:
        raise ValueError("validate_rate must be between 0 and 1, got %r" % (validate_rate,))

    all_context = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # train/test split reproducible between runs and machines.
    for filename in sorted(os.listdir(_dir)):
        path = os.path.join(_dir, filename)
        # Only regular files are samples; a nested folder cannot be read.
        if not os.path.isfile(path):
            continue
        with open(path, "r") as one_f:
            context = one_f.read()
        # Strip all line breaks and blanks so one sample is exactly one line.
        for blank in ("\n", "\r", " ", "\t"):
            context = context.replace(blank, "")
        all_context.append(context)

    splited_line_index = int(len(all_context) * validate_rate)
    # Same tab separator for both outputs, as documented in the file header.
    prefix = str(label) + "\t"

    with open(_output_file + ".test", "a") as f_output:
        f_output.writelines(
            prefix + context + "\n" for context in all_context[:splited_line_index]
        )

    with open(_output_file + ".train", "a") as f_output:
        f_output.writelines(
            prefix + context + "\n" for context in all_context[splited_line_index:]
        )

def preprocess():
    """Append every raw category folder to the shared SVM corpus files.

    Each category is converted with a validate rate of 0, so all of its
    samples are appended to the ``.train`` file of the corpus.
    """
    # step 1: (raw folder, numeric label) pairs, processed in this order
    corpus_prefix = "../dataset/svm/corpus.1.txt"
    labelled_folders = (
        ("../dataset/raw/Auto", 0),
        ("../dataset/raw/Culture", 1),
        ("../dataset/raw/Economy", 2),
        ("../dataset/raw/Medicine", 3),
        ("../dataset/raw/Military", 4),
        ("../dataset/raw/Sports", 5),
    )
    for raw_folder, category_label in labelled_folders:
        Convert2SVMData(raw_folder, category_label, corpus_prefix, 0)


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    preprocess()
